require(ggplot2)

Guidelines

check_violations <- function(sample_size, orig_new_obsv, violation_name, violation_func, sigma_sq, orig_X) {
  # Generate (X, new_obsv, eps) for one simulated data set under the requested
  # assumption violation.  Returns an unnamed list:
  #   [[1]] design values, [[2]] new observations to predict, [[3]] error vector.
  # `violation_func` is overloaded by violation type: a transform function
  # (nonlinearity, heteroskedasticity, multicollinearity), a distribution name
  # ("Gamma" / "Poisson") for non-normal errors, or the AR(1) rho (numeric)
  # for correlated errors.  Unknown violation names return NULL.
  switch(
    violation_name,

    # ordinary relationship: iid N(0, sigma_sq) errors, data untouched
    "None" = list(orig_X,
                  orig_new_obsv,
                  rnorm(sample_size, mean = 0, sd = sqrt(sigma_sq))),

    # nonlinearity: transform the predictors, keep normal errors
    "Nonlinear relationship" = list(violation_func(orig_X),
                                    violation_func(orig_new_obsv),
                                    rnorm(sample_size, mean = 0, sd = sqrt(sigma_sq))),

    # non-normal errors, shifted to mean 0 and scaled by 2
    "Non-normal errors" = {
      if (violation_func == "Gamma") {
        shape_par <- sigma_sq  # Gamma(shape = sigma_sq, scale = 1)
        list(orig_X,
             orig_new_obsv,
             (rgamma(sample_size, shape = shape_par, scale = 1) - shape_par * 1) * 2)
      } else if (violation_func == "Poisson") {
        lambda_par <- sigma_sq
        list(orig_X,
             orig_new_obsv,
             (rpois(sample_size, lambda_par) - lambda_par) * 2)
      }
    },

    # heteroskedasticity: each error's sd depends on X through violation_func
    "Heterogeneous variances" = list(orig_X,
                                     orig_new_obsv,
                                     rnorm(sample_size, mean = 0, sd = violation_func(orig_X))),

    # AR(1) errors with autocorrelation rho = violation_func.
    # NOTE(review): the innovation sd uses sigma_sq (a variance) directly,
    # i.e. sd = sigma_sq / sqrt(1 - rho^2); possibly sqrt(sigma_sq) was
    # intended -- confirm before relying on the MSE results here.
    "Correlated Errors" = {
      rho <- violation_func
      innovations <- rnorm(sample_size, mean = 0, sd = (sigma_sq / sqrt(1 - rho^2)))
      ar_errors <- rep(0, sample_size)
      ar_errors[1] <- innovations[1]
      for (t in 2:sample_size) {
        ar_errors[t] <- rho * ar_errors[t - 1] + innovations[t]
      }
      list(orig_X, orig_new_obsv, ar_errors)
    },

    # multicollinearity: second predictor column is a function of the first
    "Multicolinearity" = {
      x_col <- orig_X[, 1]
      new_col <- orig_new_obsv[, 1]
      list(unname(cbind(x_col, violation_func(x_col))),
           unname(cbind(new_col, violation_func(new_col))),
           rnorm(sample_size, mean = 0, sd = sqrt(sigma_sq)))
    }
  )
}


# function that will run simulations, violating a specific assumption
# Monte Carlo driver: repeatedly simulate data under a chosen assumption
# violation, fit an OLS model, and tally how often the standard inference
# tools (confidence intervals, t-tests) fail.
#
# Arguments:
#   num_runs       - number of simulation repetitions
#   sample_size    - n observations generated per run
#   num_new_obsv   - number of new points predicted per run
#   violation      - violation name understood by check_violations()
#   violation_func - transform / distribution name / rho used by the violation
#   num_predictors - 1 or 2 predictor columns (only these two cases handled)
#   sigma_sq       - true error variance used to generate the data
#
# Returns a list with the averaged failure rates (sim_results) plus the LAST
# run's model, data, and intervals so callers can draw diagnostics.
run_simulations <- function(num_runs, sample_size, num_new_obsv, violation, violation_func, num_predictors, sigma_sq) {
  
  # running totals; each column is accumulated across runs, then averaged below
  sim_results <- data.frame(percent_yhat_bad_confint        = 0,
                            percent_intercept_bad_confint   = 0, 
                            percent_param_bad_confint       = 0, 
                            average_mse                     = 0, 
                            percent_insignificant_intercept = 0, 
                            percent_insignificant_param     = 0) 

  # true beta_0..beta_p, drawn once (without replacement) from 1..10
  parameters <- sample.int(10,num_predictors + 1)
  for (i in 1:num_runs) {
    
    # fresh design values and new observations each run
    if (num_predictors == 2) {
      X <- cbind(runif(sample_size, 0, 10),runif(sample_size, 10, 20))
      new_obsv <- cbind(runif(num_new_obsv, 0, 10),runif(num_new_obsv, 10, 20))
    } else {X <- runif(sample_size, 0, 10)
            new_obsv <- runif(num_new_obsv, 0, 10)}
    
    
    # transform our data to violate assumptions of linearity
    violated_data <- check_violations(sample_size, new_obsv, violation, violation_func, sigma_sq, X)
    violated_X <- violated_data[[1]]
    violated_new_obsv <- violated_data[[2]]
    eps <- violated_data[[3]]

    # multicollinearity adds a second, collinear column, so the FITTED model
    # must use the violated data too (other violations fit the original X)
    # TODO: make this not break plot_diagnostics
    if (violation == "Multicolinearity") {
      X <- violated_X
      new_obsv <- violated_new_obsv
    }

    # add in column of ones to capture beta0 when generating Y and expected value of Y
    Y <- (cbind(1,violated_X) %*% parameters) + eps
    exp_val_Y <- cbind(1,violated_new_obsv) %*% parameters
    
    # fit a linear model using the simulated data (I() keeps a matrix X intact)
    lm_data <- data.frame(Y = Y, X = I(X))
    my_lm <- lm(Y ~ X, data=lm_data)
    
    # predict y for new values of x using the fitted linear model
    Y_hat_ci <- predict(my_lm, newdata = data.frame(X = I(new_obsv)), interval = "confidence")
    
    # check whether the confidence interval actually captures the expected value of y
    # from the true model (columns 2 and 3 of Y_hat_ci are the lower/upper bounds)
    n_bad_predictions <- num_new_obsv - sum((exp_val_Y >= Y_hat_ci[,2]) & (exp_val_Y <= Y_hat_ci[,3]))
    # frequency that the confint for a new prediction missed the true E[Y]
    sim_results$percent_yhat_bad_confint <- sim_results$percent_yhat_bad_confint + n_bad_predictions
    
    # get confidence interval for beta hat
    beta_ci <- confint(my_lm)
    
    # check whether the true value for the betas is captured in the confidence interval
    good_betas <- ((parameters >= beta_ci[,"2.5 %"]) & (parameters <= beta_ci[,"97.5 %"]))
    if (good_betas[1] == FALSE)

      # how many times true beta_0 was not captured by the confint for our fitted model
      sim_results$percent_intercept_bad_confint <- sim_results$percent_intercept_bad_confint + 1
    # good_betas[-1] means anything that is not the intercept coeff
    if (sum(good_betas[-1] == FALSE) >= 1) 
      # bad predictor rate: how many times a non-intercept parameter fell
      # outside its confint (summed if there is more than one predictor)
      {sim_results$percent_param_bad_confint <- sim_results$percent_param_bad_confint + sum(good_betas[-1] == FALSE)}
    
    # add the mse to the sum of MSE for all runs (to be averaged at the end)
    sim_results$average_mse <- sim_results$average_mse + (anova(my_lm)$'Mean Sq'[2]) # [2] for residual
    
    
    # check which beta_hats are significant based on the p-values
    p_vals <- summary(my_lm)$coeff[,4] # 4 for pvals
    if (p_vals[1] > .05)
      # frequency that the intercept's p-value exceeded 0.05
      sim_results$percent_insignificant_intercept <- sim_results$percent_insignificant_intercept + 1
    if (sum(p_vals[-1] > .05) >= 1)
      
      # excluding the intercept, frequency that a predictor's p-value exceeded
      # 0.05 (summed in case there is more than one predictor)
      {sim_results$percent_insignificant_param <- sim_results$percent_insignificant_param + sum(p_vals[-1] > .05)}
  }
  
  # FINALLY, take mean of all these accumulated counts over the runs
  sim_results <- sim_results/num_runs
  
  # confint for predictions did not capture expected value, divided by n new data points 
  sim_results$percent_yhat_bad_confint <- sim_results$percent_yhat_bad_confint/(num_new_obsv)
  
  # confint for predictor param did not capture the true param used to generate
  # the data, divided by number of predictors
  sim_results$percent_param_bad_confint <- sim_results$percent_param_bad_confint/(num_predictors)
  
  # percent of (non-intercept) params whose p-val > 0.05
  sim_results$percent_insignificant_param <- sim_results$percent_insignificant_param/(num_predictors)

  # bundle results plus the last run's artifacts for downstream diagnostics
  payload <- list(sim_results=sim_results, lm_model=my_lm, X=X, Y=Y, eps=eps, Y_hat_ci=Y_hat_ci, exp_val_Y=exp_val_Y)
  return(payload)
}
plot_diagnostics <- function(payload) {
  # Informal (graphical) regression diagnostics for a simulation payload.
  # payload: list with at least lm_model, X, Y (as built by run_simulations).
  list2env(payload, envir = environment())

  # Derive the predictor count from X itself; the original read a
  # `num_predictors` global that is not part of the payload, so the function
  # broke when called outside the script's global environment.
  num_predictors <- NCOL(X)

  writeLines("Diagnostic #1: Is the Regression Function Linear? ")
  if (num_predictors == 1) {
    plot(Y ~ X)
    abline(lm_model)
  } else {
    # With more than one predictor a single X-vs-Y scatterplot is ill-defined
    # (this was the "breaks on Multicolinearity" TODO); plot observed vs
    # fitted instead -- points hugging the 45-degree line suggest linearity.
    plot(fitted(lm_model), Y, xlab = "Fitted values", ylab = "Observed Y")
    abline(0, 1)
  }

  writeLines("Diagnostic #2: Do the Residuals Have Constant Variance?\n\n Does the plot appear to be a general cloud of points or is there a megaphone shape?")
  plot(lm_model, which = 1) 
  
  writeLines("Diagnostic #3: Are the Residuals Normally Distributed?")
  plot(lm_model, which = 2)
  
  writeLines("Diagnostic #4: Are the Residuals Independent?")  
  # index residuals by observation order; the original used 1:length(X),
  # which is wrong when X is a matrix (length = n * ncol)
  res <- residuals(lm_model)
  plot(seq_along(res), res)
  
  if (num_predictors > 1) {
    writeLines("Diagnostic #5: Is there Evidence of Multicollinearity?")
    # scatterplot matrix of predictors and response; the unused
    # multivariate lm fitted here in the original has been dropped
    multivariate_dataframe <- as.data.frame(cbind(X, Y))
    colnames(multivariate_dataframe) <- c(paste0("X", seq_len(num_predictors)), "Y")
    pairs(multivariate_dataframe)
  }
}
summary_diagnostics <- function(payload) {
  # Print the coefficient summary and ANOVA table for the fitted model
  # stored in the payload.
  fitted_model <- payload$lm_model

  print(summary(fitted_model))
  print(anova(fitted_model))
}
advanced_diagnostics <- function(payload) {
  # Formal diagnostic tests for a simulation payload.
  # NOTE(review): bptest() comes from the `lmtest` package and vif() from
  # `car`; this file never loads either -- confirm the caller attaches them.
  list2env(payload, envir = environment())

  # Derive the predictor count from X; the original read a `num_predictors`
  # global that is not part of the payload.
  num_predictors <- NCOL(X)

  # writeLines("Diagnostic #1: Is the Regression Function Linear? ")
  # print(anova(lm_model))

  writeLines("Diagnostic #2: Do the Residuals Have Constant Variance?\n\n Does the plot appear to be a general cloud of points or is there a megaphone shape?")

  writeLines("Large values of the test statistic lead to us rejecting H0 and concluding that we have nonconstant variance (ie heteroskedacity).")

  # Breusch-Pagan test on the already-fitted model; the original passed the
  # formula Y ~ X, which silently refit the regression from scratch.
  print(bptest(lm_model))

  writeLines("Diagnostic #3: Are the Residuals Normally Distributed?")
  res <- residuals(lm_model)
  if (length(res) <= 5000) {
    print(shapiro.test(res))
  } else {
    # shapiro.test() errors for n > 5000; use a KS test on standardized
    # residuals as a large-sample fallback
    print(ks.test(scale(res), "pnorm"))
  }

  if (num_predictors > 1) {
    writeLines("Diagnostic #5: Is there Evidence of Multicollinearity?")  
    writeLines("Here are some rough guidelines:
  VIF_j≈1: no evidence of multicollinearity for x_j
  1<VIF_j<5: evidence of moderate multicollinearity for x_j
  5≤VIF_j<10: evidence of strong multicollinearity for x_j
  VIF_j≥10 evidence of severe multicollinearity for x_j, will almost certainly cause problems")

    # refit with named columns so vif() reports one value per predictor
    multivariate_dataframe <- as.data.frame(cbind(X, Y))
    colnames(multivariate_dataframe) <- c(paste0("X", seq_len(num_predictors)), "Y")
    multivariate_lm_model <- lm(Y ~ ., data = multivariate_dataframe)

    print(vif(multivariate_lm_model))
  }
}

Questions

  1. Nonlinear relationship
    1. Simulate data for a variety of different non-linear relationships (e.g. polynomial, exponential, sinusoidal).
    2. Try simulations with a small sample size (e.g. 20), a medium sample size (e.g. n= 100), and a large sample size (e.g. n = 5000).
    3. For each simulation,
      1. Predict y-hat at several different locations using a confidence interval.
      2. Predict the beta coefficients for a linear model using a confidence interval.
      3. Find the MSE (to estimate sigma^2)
      4. Test to see whether the beta(s) are significant
    4. Which of the above tasks were affected by the nonlinear relationship?
    5. After you have experimented with the effects of different model structures, true parameter values, and sample sizes, let’s repeat the simulation but test yourself to see whether you can detect non-linearity.
      1. Have R randomly choose whether to simulate data from a true linear model or a true nonlinear model.
      2. Simulate data accordingly and display informal/ formal diagnostics as appropriate.
      3. Based on the diagnostics, predict whether the problem areas you mentioned in part d will be affected or not. (Note: You are not predicting whether the assumptions are violated– just whether they are violated to such an extent that your ability to use the model is compromised)

We predict that the smaller models will be more likely to be affected by a nonlinear relationship. With large sample sizes, the normality assumption is not critical unless you are predicting new observations. But with smaller sample sizes, a violation of nonnormality is more likely to affect the model. And since we only have 10 observations in the smaller simulation, the nonnormality assumption is especially difficult to detect because we expect random variation in small sample sizes anyways.

# n_sims <- 100
# n_samps <- 100
# num_new_obsv <- 10
# violation_type <- "Nonlinear relationship"
# violation_func <- sin
# num_predictors <- 1
# sigma_sq <- 1
# 
# payload <- run_simulations(n_sims, n_samps, num_new_obsv, violation_type, violation_func, num_predictors, sigma_sq)
# cat(violation_type, n_sims, n_samps, num_new_obsv, str(violation_func), num_predictors, sigma_sq)
# plot_diagnostics(payload)

Aside: Many of the issues you end up facing with a nonlinear relationship can also be seen if an important predictor is excluded from the model. If you have extra time, feel free to play with this issue as well (optional).

# set up constants shared by all of the simulations below
set.seed(1)            # reproducibility for the whole script
n_small <- 20          # "small" sample size
n_med <- 100           # "medium" sample size
n_large <- 5000        # "large" sample size
num_runs <- 100        # Monte Carlo repetitions per scenario
num_new_obsv <- 20     # new points predicted in each run
num_predictors <- 1    # single-predictor design unless noted
sigma_sq <- 4          # true error variance

PARTS A, B, & C

POLYNOMIAL

poly_func <- function(x) {
  # Shifted parabola used as the "polynomial" nonlinearity: (x - 5)^2.
  (x - 5)^2
}
poly_small_results <- run_simulations(num_runs,
                                      n_small, 
                                      num_new_obsv, 
                                      violation = "Nonlinear relationship", 
                                      violation_func = poly_func, 
                                      num_predictors, 
                                      sigma_sq)

poly_med_results <-run_simulations(num_runs,
                                   n_med, 
                                   num_new_obsv, 
                                   violation =  "Nonlinear relationship", 
                                   violation_func = poly_func, 
                                   num_predictors, 
                                   sigma_sq)

poly_large_results <- run_simulations(num_runs, 
                                     n_large, 
                                     num_new_obsv, 
                                     violation =  "Nonlinear relationship", 
                                     violation_func = poly_func, 
                                     num_predictors, 
                                     sigma_sq)

EXPONENTIAL

exp_small_results <- run_simulations(num_runs,
                                     n_small, 
                                     num_new_obsv, 
                                     violation =  "Nonlinear relationship", 
                                     violation_func = exp, 
                                     num_predictors, 
                                     sigma_sq)

exp_med_results <-run_simulations(num_runs, 
                                  n_med, 
                                  num_new_obsv, 
                                  violation =  "Nonlinear relationship",
                                  violation_func = exp, 
                                  num_predictors, 
                                  sigma_sq)

exp_large_results <-run_simulations(num_runs,
                                    n_large, 
                                    num_new_obsv, 
                                    violation =  "Nonlinear relationship", 
                                    violation_func = exp, 
                                    num_predictors, 
                                    sigma_sq)

SINUSOIDAL

sin_small_results <- run_simulations(num_runs, 
                                     n_small, 
                                     num_new_obsv, 
                                     violation =  "Nonlinear relationship", 
                                     violation_func = sin, 
                                     num_predictors, 
                                     sigma_sq)

sin_med_results <- run_simulations(num_runs, 
                                   n_med, 
                                   num_new_obsv, 
                                   violation =  "Nonlinear relationship", 
                                   violation_func = sin, 
                                   num_predictors, 
                                   sigma_sq)

sin_large_results <- run_simulations(num_runs, 
                                     n_large, 
                                     num_new_obsv, 
                                     violation =  "Nonlinear relationship", 
                                     violation_func = sin, 
                                     num_predictors, 
                                     sigma_sq)

LOGARITHMIC

log_small_results <- run_simulations(num_runs, 
                                     n_small, 
                                     num_new_obsv, 
                                     violation =  "Nonlinear relationship", 
                                     violation_func = log,
                                     num_predictors, 
                                     sigma_sq)

log_med_results <- run_simulations(num_runs, 
                                   n_med, 
                                   num_new_obsv, 
                                   violation =  "Nonlinear relationship", 
                                   violation_func = log, 
                                   num_predictors, 
                                   sigma_sq)

log_large_results <- run_simulations(num_runs, 
                                     n_large, 
                                     num_new_obsv, 
                                     violation =  "Nonlinear relationship", 
                                     violation_func = log, 
                                     num_predictors, 
                                     sigma_sq)

QUESTION TEMPLATES

TODO template for part D Prediction confidence intervals
Confidence interval for \(\beta_0\)
Confidence interval for \(\beta_1\)
Average MSE
P-values for \(\beta_0\) and \(\beta_1\)

TODO template for part E

We used R to choose whether to simulate data from a true linear model or a true nonlinear model. And then we simulated data accordingly and displayed diagnostics as appropriate.

From this simulation, we can tell that a linear function was generated.

Based on the diagnostics, we predict that the problem areas that we mentioned in part d will/will not be affected.

PART D

(polynomial_results <- rbind(small_samp = poly_small_results[[1]], 
                            med_samp = poly_med_results[[1]], 
                            large_samp = poly_large_results[[1]]))
(exponential_results <- rbind(small_samp = exp_small_results[[1]],
                             med_samp = exp_med_results[[1]], 
                             large_samp = exp_large_results[[1]]))
(sinusoidal_results <- rbind(small_samp = sin_small_results[[1]], 
                            med_samp = sin_med_results[[1]], 
                            large_samp = sin_large_results[[1]]))
(logarithmic_results <- rbind(small_samp = log_small_results[[1]], 
                             med_samp = log_med_results[[1]], 
                             large_samp = log_large_results[[1]]))

Prediction confidence intervals
Each different type of non-linear relationship severely affected the quality of the predictions, and the quality continued to decrease as sample size increased.

Confidence interval for \(\beta_0\)
The confidence interval for the intercept failed to capture the true intercept for most of the simulations, and its accuracy in capturing it decreased significantly as the sample size increased. For each type of non-linear relationship we tested, the confidence interval for \(\beta_0\) failed to capture the true value of the intercept for every single simulation when the sample size was set to 5000.

Confidence interval for \(\beta_1\)
The confidence interval for \(\beta_1\) performed even more poorly than the CI for the intercept. The vast majority of the simulations failed to capture the true value.

Average MSE
The average MSE for each non-linear relationship tested was wildly inaccurate as an estimator of the variance of the error, \(\sigma^2\), which was set to be 4 for each simulation.

P-values for \(\beta_0\) and \(\beta_1\)
The p-value for the intercept was usually found to be significant (less than \(\alpha = .05\)), regardless of the underlying non-linear relationship, especially if the sample size was high. In the exponential and logarithmic simulations, the coefficient \(\beta_1\) was also usually significant, but the polynomial and sinusoidal simulations usually found that predictor to be insignificant.

PART E

TODO write interpretation

# part e: randomly pick a candidate nonlinearity and whether to violate at all.
# c() of functions builds a LIST, and sample() on a list returns a length-1
# list -- the original assigned that list directly, so check_violations later
# failed with "attempt to apply non-function"; [[1]] extracts the function.
random.func1 <- sample(c(poly_func, sin, exp, log), 1)[[1]]

n <- sample(20:5000, 1)
random.violation1 <- sample(c("None","Nonlinear relationship"), 1)

random_results1 <- run_simulations(num_runs, 
                                  n, 
                                  num_new_obsv, 
                                  violation =  random.violation1,
                                  violation_func = random.func1,
                                  num_predictors, 
                                  sigma_sq)
random_results1[[1]]
plot_diagnostics(random_results1)
## Diagnostic #1: Is the Regression Function Linear?

## Diagnostic #2: Do the Residuals Have Constant Variance?
## 
##  Does the plot appear to be a general cloud of points or is there a megaphone shape?

## Diagnostic #3: Are the Residuals Normally Distributed?

## Diagnostic #4: Are the Residuals Independent?

  1. Non-normal errors
    1. Simulate errors from a variety of different non-normal distributions (e.g. gamma, poisson). Make sure to shift the errors over so that they are still centered at 0.
    2. Try simulations with a small sample size (e.g. 20), a medium sample size (e.g. n= 100), and a large sample size (e.g. n = 5000).
    3. For each simulation,
      1. Predict y-hat at several different locations using a confidence interval.
      2. Predict the beta coefficients for a linear model using a confidence interval.
      3. Find the MSE (to estimate sigma^2)
      4. Test to see whether the beta(s) are significant (t-tests)
    4. Which of the above tasks were affected by the violation of assumptions?
    5. After you have experimented with the effects of different model structures, true parameter values, and sample sizes, let’s repeat the simulation but test yourself to see whether you can detect non-normality.
      1. Have R randomly choose whether to simulate data with normal or nonnormal errors
      2. Simulate data accordingly and display informal/ formal diagnostics as appropriate.
      3. Based on the diagnostics, predict whether the problem areas you mentioned in part d will be affected or not. (Note: You are not predicting whether the assumptions are violated– just whether they are violated to such an extent that your ability to use the model is compromised) ## PARTS A, B, & C ### GAMMA
gamma_small_results <- run_simulations(num_runs, 
                                     n_small, 
                                     num_new_obsv, 
                                     violation =  "Non-normal errors", 
                                     violation_func = "Gamma",
                                     num_predictors, 
                                     sigma_sq)

gamma_med_results <- run_simulations(num_runs, 
                                   n_med, 
                                   num_new_obsv, 
                                   violation =  "Non-normal errors", 
                                   violation_func = "Gamma", 
                                   num_predictors, 
                                   sigma_sq)

gamma_large_results <- run_simulations(num_runs, 
                                     n_large, 
                                     num_new_obsv, 
                                     violation =  "Non-normal errors", 
                                     violation_func = "Gamma", 
                                     num_predictors, 
                                     sigma_sq)

POISSON

poisson_small_results <- run_simulations(num_runs, 
                                     n_small, 
                                     num_new_obsv, 
                                     violation =  "Non-normal errors", 
                                     violation_func = "Poisson",
                                     num_predictors, 
                                     sigma_sq)

poisson_med_results <- run_simulations(num_runs, 
                                   n_med, 
                                   num_new_obsv, 
                                   violation =  "Non-normal errors", 
                                   violation_func = "Poisson", 
                                   num_predictors, 
                                   sigma_sq)

poisson_large_results <- run_simulations(num_runs, 
                                     n_large, 
                                     num_new_obsv, 
                                     violation =  "Non-normal errors", 
                                     violation_func = "Poisson", 
                                     num_predictors, 
                                     sigma_sq)

PART D

(gamma_results <- rbind(small_samp = gamma_small_results[[1]], 
                             med_samp = gamma_med_results[[1]], 
                             large_samp = gamma_large_results[[1]]))
(poisson_results <- rbind(small_samp = poisson_small_results[[1]], 
                             med_samp = poisson_med_results[[1]], 
                             large_samp = poisson_large_results[[1]]))

Prediction confidence intervals
The confidence interval for the predictions performed very well. They failed about 5% of the time, which is to be expected since it is a 95% confidence interval.

Confidence interval for \(\beta_0\)
The confidence interval for the intercept performed very well, also failing for about 5% of the simulations.

Confidence interval for \(\beta_1\)
The confidence interval for \(\beta_1\) performed very well, also failing for about 5% of the simulations.

Average MSE
The variance for the errors was always set to 4, regardless of which distribution was used to generate them. Both simulations using either the Gamma or Poisson distributed errors resulted in an average MSE of approximately 16, which is significantly higher than the true variance.

P-values for \(\beta_0\) and \(\beta_1\)
The simulations using the small sample size of 20 would often find the intercept to be insignificant. However, the coefficient for the predictor was always found to be significant, regardless of sample size or the distribution used to generate the errors.

PART E

TODO write interpretation

#part e, name the normal func & nonnormal func & randomly call it
random.func2 <- sample(c("Poisson","Gamma"), 1)
n <- sample(20:5000, 1)
random.violation2 <- sample(c("None","Non-normal errors"), 1)

random_results2 <- run_simulations(num_runs, 
                                  n, 
                                  num_new_obsv, 
                                  violation =  random.violation2,
                                  violation_func = random.func2,
                                  num_predictors, 
                                  sigma_sq)

random_results2[[1]]
plot_diagnostics(random_results2)
## Diagnostic #1: Is the Regression Function Linear?

## Diagnostic #2: Do the Residuals Have Constant Variance?
## 
##  Does the plot appear to be a general cloud of points or is there a megaphone shape?

## Diagnostic #3: Are the Residuals Normally Distributed?

## Diagnostic #4: Are the Residuals Independent?

  1. Heterogeneous Variances
    1. Simulate errors from a variety of different relationships with X (e.g. eps = 2 * sqrt(x))
    2. Try simulations with a small sample size (e.g. 20), a medium sample size (e.g. n= 100), and a large sample size (e.g. n = 5000).
    3. For each simulation,
      1. Predict y-hat at several different locations using a confidence interval.
      2. Predict the beta coefficients for a linear model using a confidence interval.
      3. Find the MSE (to estimate sigma^2– does that even make sense here?)
      4. Test to see whether the beta(s) are significant (t-tests)
    4. Which of the above tasks were affected by the violation of assumptions?
    5. After you have experimented with the effects of different model structures, true parameter values, and sample sizes, let’s repeat the simulation but test yourself to see whether you can detect heteroskedacity.
      1. Have R randomly choose whether to simulate errors with constant or non-constant variance
      2. Simulate data accordingly and display informal/ formal diagnostics as appropriate.
      3. Based on the diagnostics, predict whether the problem areas you mentioned in part d will be affected or not. (Note: You are not predicting whether the assumptions are violated– just whether they are violated to such an extent that your ability to use the model is compromised)

PARTS A, B, & C

poly_func_2 <- function(x) {
  # Variance-shape function for the heteroskedasticity runs: 0.5 * x^2.
  0.5 * x^2
}
hetero_var_poly_small_results <- run_simulations(num_runs,
                                     n_small, 
                                     num_new_obsv, 
                                     violation =  "Heterogeneous variances", 
                                     violation_func = poly_func_2, 
                                     num_predictors, 
                                     sigma_sq)

hetero_var_poly_med_results <-run_simulations(num_runs, 
                                  n_med, 
                                  num_new_obsv, 
                                  violation =  "Heterogeneous variances",
                                  violation_func = poly_func_2, 
                                  num_predictors, 
                                  sigma_sq)

hetero_var_poly_large_results <-run_simulations(num_runs,
                                    n_large, 
                                    num_new_obsv, 
                                    violation =  "Heterogeneous variances", 
                                    violation_func = poly_func_2, 
                                    num_predictors, 
                                    sigma_sq)

hetero_var_exp_small_results <- run_simulations(num_runs,
                                     n_small, 
                                     num_new_obsv, 
                                     violation =  "Heterogeneous variances", 
                                     violation_func = exp, 
                                     num_predictors, 
                                     sigma_sq)

hetero_var_exp_med_results <-run_simulations(num_runs, 
                                  n_med, 
                                  num_new_obsv, 
                                  violation =  "Heterogeneous variances",
                                  violation_func = exp, 
                                  num_predictors, 
                                  sigma_sq)

hetero_var_exp_large_results <-run_simulations(num_runs,
                                    n_large, 
                                    num_new_obsv, 
                                    violation =  "Heterogeneous variances", 
                                    violation_func = exp, 
                                    num_predictors, 
                                    sigma_sq)

PART D

hetero_var_poly_results <- rbind(small_samp = hetero_var_poly_small_results[[1]],
                             med_samp = hetero_var_poly_med_results[[1]], 
                             large_samp = hetero_var_poly_large_results[[1]])
hetero_var_poly_results
hetero_var_exp_results <- rbind(small_samp = hetero_var_exp_small_results[[1]],
                             med_samp = hetero_var_exp_med_results[[1]], 
                             large_samp = hetero_var_exp_large_results[[1]])
hetero_var_exp_results

Prediction confidence intervals

Confidence interval for \(\beta_0\)
Confidence interval for \(\beta_1\)
Average MSE
P-values for \(\beta_0\) and \(\beta_1\)

PART E

TODO write interpretation

#part e, name the normal func & nonnormal func & randomly call it
random_poly_func <- function(x) {
  # Randomized polynomial used as the heteroskedastic sd function:
  # |x - a|^b with a ~ U(0, 5) and b ~ U(2, 4).
  # abs() fixes a bug in the original: (x - a)^b is NaN whenever x < a and
  # b is non-integer, which fed rnorm(..., sd = NaN) and produced NA errors.
  a <- runif(1,0,5)
  b <- runif(1,2,4)
  return(abs(x-a)^b)
}
# Blind self-test: randomly pick a sample size and whether to violate
# homoscedasticity, so the analyst must detect the violation from diagnostics.
n <- sample(20:5000, 1)
random.violation3 <- sample(c("None","Heterogeneous variances"), 1)

# When "Heterogeneous variances" is drawn, check_violations uses
# violation_func(X) as the per-observation error sd, so random_poly_func
# here controls how variance changes with X.
random_results3 <- run_simulations(num_runs, 
                                  n, 
                                  num_new_obsv, 
                                  violation =  random.violation3,
                                  violation_func = random_poly_func,
                                  num_predictors, 
                                  sigma_sq)

# Summary table for the blind run (diagnostics plot left disabled).
random_results3[[1]]
#plot_diagnostics(random_results3)
  1. Correlated Errors
    1. Simulate errors from a variety of different correlation structures.
    2. Try simulations with a small sample size (e.g. 20), a medium sample size (e.g. n= 100), and a large sample size (e.g. n = 5000).
    3. For each simulation,
      1. Predict y-hat at several different locations using a confidence interval.
      2. Predict the beta coefficients for a linear model using a confidence interval.
      3. Find the MSE (to estimate sigma^2)
      4. Test to see whether the beta(s) are significant (t-tests)
    4. Which of the above tasks were affected by the violation of assumptions?
    5. After you have experimented with the effects of different model structures, true parameter values, and sample sizes, let’s repeat the simulation but test yourself to see whether you can detect correlated errors.
      1. Have R randomly choose whether to simulate data with correlated or uncorrelated errors.
      2. Simulate data accordingly and display informal/ formal diagnostics as appropriate.
      3. Based on the diagnostics, predict whether the problem areas you mentioned in part d will be affected or not. (Note: You are not predicting whether the assumptions are violated– just whether they are violated to such an extent that your ability to use the model is compromised)

## PARTS A, B, & C
# PARTS A, B, & C: AR(1)-correlated errors with strong autocorrelation.
# violation_func carries rho, the lag-1 correlation of the error process.
corr_error_small_results_h <- run_simulations(
  num_runs,
  n_small,
  num_new_obsv,
  violation = "Correlated Errors",
  violation_func = 0.9,
  num_predictors,
  sigma_sq
)

corr_error_med_results_h <- run_simulations(
  num_runs,
  n_med,
  num_new_obsv,
  violation = "Correlated Errors",
  violation_func = 0.9,
  num_predictors,
  sigma_sq
)

corr_error_large_results_h <- run_simulations(
  num_runs,
  n_large,
  num_new_obsv,
  violation = "Correlated Errors",
  violation_func = 0.9,
  num_predictors,
  sigma_sq
)

# Same three sample sizes with mild autocorrelation (rho = 0.3) for contrast.
corr_error_small_results_l <- run_simulations(
  num_runs,
  n_small,
  num_new_obsv,
  violation = "Correlated Errors",
  violation_func = 0.3,
  num_predictors,
  sigma_sq
)

corr_error_med_results_l <- run_simulations(
  num_runs,
  n_med,
  num_new_obsv,
  violation = "Correlated Errors",
  violation_func = 0.3,
  num_predictors,
  sigma_sq
)

corr_error_large_results_l <- run_simulations(
  num_runs,
  n_large,
  num_new_obsv,
  violation = "Correlated Errors",
  violation_func = 0.3,
  num_predictors,
  sigma_sq
)

PART D

# PART D: stack the summary tables (first list element) by sample size,
# one combined table per rho level.
corr_error_results_h <- do.call(rbind, list(
  small_samp = corr_error_small_results_h[[1]],
  med_samp = corr_error_med_results_h[[1]],
  large_samp = corr_error_large_results_h[[1]]
))
corr_error_results_h
corr_error_results_l <- do.call(rbind, list(
  small_samp = corr_error_small_results_l[[1]],
  med_samp = corr_error_med_results_l[[1]],
  large_samp = corr_error_large_results_l[[1]]
))
corr_error_results_l

I compared different sample sizes with different values of rho, the level of correlation in the error terms. After running the simulations, several tasks were affected. Most notably, the average MSE, which estimates the error variance, was in each case greater than the true value of \(\sigma^2 = 4\). When rho = 0.9 the average MSE was far greater, and when rho = 0.3 it was greater as well, although not by much. In the case where rho was set to 0.9, the true value of \(\hat{y}\) was missed by over half of the simulations' confidence intervals. Similarly, the proportion of simulations that failed to capture the true intercept was greater than 30% at each sample size. Another problem appeared in the proportion of simulations that found the intercept to be insignificant: in the small and medium sample sizes this occurred over 39% of the time. Despite these problems, the proportion of simulations that failed to include the true value of \(\beta_{1}\) was very low — less than 10% — and none of the simulations considered \(\beta_{1}\) to be insignificant. Strangely, the simulations with rho = 0.3 and a small sample size failed to capture the true \(\beta_{0}\) over 70% of the time.

PART E

# Blind self-test: draw rho from {0.7, 0} so the errors are either strongly
# autocorrelated or independent, then inspect the diagnostics.
rho_vec <- c(0.7, 0.0)
random_error_large_results_h <- run_simulations(
  num_runs,
  n_large,
  num_new_obsv,
  violation = "Correlated Errors",
  violation_func = sample(rho_vec, 1),
  num_predictors,
  sigma_sq
)

plot_diagnostics(random_error_large_results_h)
## Diagnostic #1: Is the Regression Function Linear?

## Diagnostic #2: Do the Residuals Have Constant Variance?
## 
##  Does the plot appear to be a general cloud of points or is there a megaphone shape?

## Diagnostic #3: Are the Residuals Normally Distributed?

## Diagnostic #4: Are the Residuals Independent?

  1. Multicollinearity
    1. Simulate predictors that are correlated with a variety of different correlation structures
    2. Try simulations with a small sample size (e.g. 20), a medium sample size (e.g. n= 100), and a large sample size (e.g. n = 5000).
    3. For each simulation,
      1. Predict y-hat at several different locations using a confidence interval.
      2. Predict the beta coefficients for a linear model using a confidence interval.
      3. Find the MSE (to estimate sigma^2)
      4. Test to see whether the beta(s) are significant (t-tests)
    4. Which of the above tasks were affected by the violation of assumptions?
      The simulation with the largest sample size was most visibly affected by the violation of the assumptions; at the smaller sample sizes the moderate multicollinearity did not noticeably degrade the model, which is consistent with moderate multicollinearity having only a mild effect overall.
    5. After you have experimented with the effects of different model structures, true parameter values, and sample sizes, let’s repeat the simulation but test yourself to see whether you can detect collinearity.
      1. Have R randomly choose whether to simulate data with correlated or uncorrelated predictor variables (X).
      2. Simulate data accordingly and display informal/ formal diagnostics as appropriate.
      3. Based on the diagnostics, predict whether the problem areas you mentioned in part d will be affected or not. (Note: You are not predicting whether the assumptions are violated– just whether they are violated to such an extent that your ability to use the model is compromised)

## PARTS A, B, & C
# Build a collinear second predictor: the input plus standard-normal noise.
add_noise <- function(x) {
  x + rnorm(length(x))
}
# Multicollinearity via a noisy copy of the first predictor (X2 = X1 + noise),
# run at the three sample sizes.
multicolinearity_noise_small_results <- run_simulations(
  num_runs,
  n_small,
  num_new_obsv,
  violation = "Multicolinearity",
  violation_func = add_noise,
  num_predictors = 2,
  sigma_sq
)

multicolinearity_noise_med_results <- run_simulations(
  num_runs,
  n_med,
  num_new_obsv,
  violation = "Multicolinearity",
  violation_func = add_noise,
  num_predictors = 2,
  sigma_sq
)

multicolinearity_noise_large_results <- run_simulations(
  num_runs,
  n_large,
  num_new_obsv,
  violation = "Multicolinearity",
  violation_func = add_noise,
  num_predictors = 2,
  sigma_sq
)

# Deterministically-dependent second predictor: the square of the first.
poly_func_3 <- function(x) {
  x^2
}
# Multicollinearity via an exact functional dependence (X2 = X1^2),
# run at the three sample sizes.
multicolinearity_poly_small_results <- run_simulations(
  num_runs,
  n_small,
  num_new_obsv,
  violation = "Multicolinearity",
  violation_func = poly_func_3,
  num_predictors = 2,
  sigma_sq
)

multicolinearity_poly_med_results <- run_simulations(
  num_runs,
  n_med,
  num_new_obsv,
  violation = "Multicolinearity",
  violation_func = poly_func_3,
  num_predictors = 2,
  sigma_sq
)

multicolinearity_poly_large_results <- run_simulations(
  num_runs,
  n_large,
  num_new_obsv,
  violation = "Multicolinearity",
  violation_func = poly_func_3,
  num_predictors = 2,
  sigma_sq
)

PART D

# PART D: stack the summary tables (first list element) by sample size,
# one combined table per collinearity structure.
multicolinearity_noise_results <- do.call(rbind, list(
  small_samp = multicolinearity_noise_small_results[[1]],
  med_samp = multicolinearity_noise_med_results[[1]],
  large_samp = multicolinearity_noise_large_results[[1]]
))
multicolinearity_noise_results
multicolinearity_poly_results <- do.call(rbind, list(
  small_samp = multicolinearity_poly_small_results[[1]],
  med_samp = multicolinearity_poly_med_results[[1]],
  large_samp = multicolinearity_poly_large_results[[1]]
))
multicolinearity_poly_results

PART E

TODO write interpretation

#part e, name the normal func & nonnormal func & randomly call it
# Blind self-test: random sample size, and randomly decide whether the two
# predictors are collinear or independent.
n <- sample(20:5000, 1)
random.violation5 <- sample(c("None","Multicolinearity"), 1)

# When "Multicolinearity" is drawn, check_violations builds the second
# predictor column as violation_func of the first (here X1 plus noise).
random_results5 <- run_simulations(num_runs,
                                  n,
                                  num_new_obsv,
                                  violation =  random.violation5,
                                  violation_func = add_noise,
                                  num_predictors = 2,
                                  sigma_sq)

# Summary table for the blind run (diagnostics plot left disabled).
random_results5[[1]]
#plot_diagnostics(random_results5)
  1. Put it all together: Combine the code from the previous 5 parts. Have R randomly choose whether to generate data that violates one (or more) of the assumptions, or whether all the assumptions are valid. Show appropriate diagnostics and test yourself to see if you can predict whether there are problem areas or not. Repeat the simulation several times and record your accuracy at detecting the different problem areas.